Neo IL2 Complex Docking

Author

Joe Boktor, Alec Lourenço

Published

March 26, 2023

Code
@media (min-width: 30em) {
  .panel-tabset {
    display: grid;
    grid-gap: 2em;
    grid-template-columns: 25% 75%;
  }
  .panel-tabset-tabby {
    border-bottom: none !important;
    border-right: 1px solid #bbb; 
  }
  .panel-tabset [role=tab][aria-selected=true] {
    background-color: transparent;
    border: 1px solid #bbb !important;
  }
  h3 {
    margin-top: 0;
  }
}

AlphaFold Multimer Complex Docking

Analysis Setup

Code
library(tidyverse)
library(magrittr)
library(glue)
library(fs)
library(bio3d)
library(ggpackets)
library(ggpointdensity)
library(viridis)
library(ggside)
library(r3dmol)
library(seriation)
library(ggmsa)
library(patchwork)
library(gghalves)
library(ggdist)
library(protr)
library(reticulate)
# Plotting functions
homedir <- "/central/groups/MazmanianLab/joeB"
wkdir <- glue(
  "{homedir}/",
  "Microbiota-Immunomodulation/Microbiota-Immunomodulation"
)
src_dir <- glue("{wkdir}/notebooks")
source(glue("{src_dir}/R-scripts/helpers_general.R"))
docking_dir <- "/central/groups/MazmanianLab/joeB/docking"
proj_dir <- glue("{docking_dir}/projects/neo2_variants")
analysis_dir <- glue("{proj_dir}/structural_analysis")
neo_dir <- glue("{proj_dir}/ESM_PDBs")

Converting PDBs into FASTA (rec/lig pairs)

Code
pdb_paths <- list.files(neo_dir, full.names = TRUE) %>%
  grep("pnon|pmin", ., value = TRUE, invert = TRUE)
il2_variants <- pdb_paths %>% grep("MPNN|alpha", ., value = TRUE)

# IL2 RECEPTORS
Il2rb <- bio3d::read.fasta(glue("{proj_dir}/Il2rb.fasta"))
Il2rg <- bio3d::read.fasta(glue("{proj_dir}/Il2rg.fasta"))

# IL2 VARIANTS
fasta_dir <-
  "/central/groups/MazmanianLab/joeB/docking/projects/neo2_variants/fastas"
for (pdb_path in il2_variants) {
  pdb <- bio3d::read.pdb(pdb_path)
  clean_pdb <- clean.pdb(pdb,
    consecutive = TRUE, force.renumber = TRUE,
    fix.chain=TRUE, fix.aa = TRUE, rm.wat = TRUE, rm.lig = TRUE, verbose = TRUE
  )
  il2_fasta <-
    bio3d::pdbseq(
      clean_pdb,
      atom.select(clean_pdb, "calpha"),
      aa1 = TRUE
    ) %>%
    bio3d::as.fasta()

  # save fasta file with receptor and unique il2 variant
  output_fasta <- glue(
    "{fasta_dir}/IL2RBG_{basename(path_ext_remove(pdb_path))}.fasta"
  )

  bio3d::write.fasta(Il2rb,
    file = output_fasta,
    ids = "Il2rb"
  )
  bio3d::write.fasta(Il2rg,
    file = output_fasta,
    ids = "Il2rg",
    append = TRUE
  )
  bio3d::write.fasta(il2_fasta,
    file = output_fasta,
    ids = basename(path_ext_remove(pdb_path)),
    append = TRUE
  )
}

Executing multimer

Code
cluster_reports <- glue(
  "{wkdir}/.cluster_runs/",
  "{Sys.Date()}_AFMultimer_NEO-IL2-DOCKING_{rand_string()}"
)
message("\n\n CREATING:  ", cluster_reports, "\n")
shell_do(glue("mkdir -p {cluster_reports}"))

complex_fastas <-
  list.files(fasta_dir, full.names = TRUE)

for (complex_path in complex_fastas) {
  jname <- fs::path_ext_remove(basename(complex_path))
  slurm_run_alphafold(
    jobname = jname,
    slurm_out = cluster_reports,
    input_fasta = complex_path,
    walltime = "06:00:00",
    mem = "32G",
    gpus = 1,
    cpus_per_task = 8,
    output_dir = glue("/central/scratch/jbok/alphafold-multimer/2023-03-26_neo-IL2/{jname}")
  )
}

Reading in multimer model pkl results

Code
reticulate::use_condaenv(condaenv = "pdmbsR", required = TRUE)
source_python(glue("{src_dir}/python-scripts/pickle_reader.py"))

Extracting metrics of interest

Code
# list all model pkl files ----
output_dir <-
  glue(
    "/central/scratch/jbok/alphafold-multimer/2023-03-26_neo-IL2"
  )
model_pkl_paths <-
  list.files(
    output_dir, pattern = ".pkl", 
    full.names = TRUE, recursive = TRUE
  ) %>%
  keep(!grepl("features", .))

extract_pkl_results <- function(pkl_path) {
  pkl <- read_pickle_file(pkl_path)
  data_out <- list(
    pkl_path = pkl_path,
    ptm = pkl$ptm,
    iptm = pkl$iptm,
    max_predicted_aligned_error = pkl$max_predicted_aligned_error,
    ranking_confidence = pkl$ranking_confidence
  )
  return(data_out)
}

performance_results <- 
  purrr::map_dfr(model_pkl_paths, ~ extract_pkl_results(.))
saveRDS(
  performance_results,
  glue("{proj_dir}/multimer_performance_results.rds")
)

Visualizing model iPTM scores

Code
performance_results <- readRDS(
  glue("{proj_dir}/multimer_performance_results.rds")
)
peformance_df <-
  performance_results %>%
  mutate(
    complex = basename(dirname(pkl_path)),
    model_name = basename(pkl_path),
    ranking_score = ptm + iptm
  )
  
p_iptms <- peformance_df %>%
  ggplot(aes(x = fct_reorder(complex, iptm), y = iptm)) +
  ggdist::stat_interval(
    .width = c(.25, 0.5, .75, 1),
    show.legend = T,
    height = 0.0001
  ) +
  gghalves::geom_half_point(
    side = "l", range_scale = .3, alpha = 0.8,
    shape = 21, color = "black", size = 1.5
  ) +
  scale_color_brewer(palette = "PuBuGn") +
  theme_light() +
  coord_flip() +
  labs(y = "iPTM", x = NULL, color = "interval") +
  theme(
    panel.grid = element_blank(),
    legend.position = "top"
  )

ggsave(
  glue("{proj_dir}/structural_analysis/figures/iptm.png"),
  p_iptms, width = 5, height = 7, dpi = 600
  )

Figure 1: iPTM distributions

Comparative Structural Analysis

Generating 3D visuals

Code
af_pdb_paths <- list.files(
  "/central/scratch/jbok/alphafold-multimer/2023-03-26_neo-IL2",
  full.names = TRUE, recursive = TRUE, pattern = "ranked_0.pdb"
)

pdb_list <- list()
af_models <- list()
for (af_model in af_pdb_paths) {
  pair <- basename(dirname(dirname(af_model)))
  message("Processing: ", pair)
  pdb_list[[pair]] <- bio3d::read.pdb(af_model)
  af_models[[pair]] <- r3dmol(
    viewer_spec = m_viewer_spec(
      cartoonQuality = 10,
      lowerZoomLimit = 250,
      upperZoomLimit = 2000
    )
  ) %>%
    m_add_model(data = m_bio3d(pdb_list[[pair]])) %>%
    m_add_surface(style = m_style_surface(opacity = 0.4)) %>%
    m_set_style(style = m_style_cartoon()) %>%
    m_set_style(
      sel = m_sel(chain = "A"),
      style = m_style_cartoon(color = "#FC027E")
    ) %>%
    m_set_style(
      sel = m_sel(chain = "B"),
      style = m_style_cartoon(color = "#02bdfc")
    ) %>%
    m_set_style(
      sel = m_sel(chain = "C"),
      style = m_style_cartoon(color = "#02fc8b")
    ) %>% 
    m_zoom_to()
}

saveRDS(pdb_list, glue("{analysis_dir}/af_ranked-0-PDBs.rds"))
saveRDS(af_models, glue("{analysis_dir}/af_r3dmol_models.rds"))
Code
af_pdb_paths <- list.files(
  "/central/scratch/jbok/alphafold-multimer/2023-03-26_neo-IL2",
  full.names = TRUE, recursive = TRUE, pattern = "ranked_0.pdb"
)
names(af_pdb_paths) <- gsub("IL2RBG_", "", basename(dirname(af_pdb_paths)))
Code
af_models <- readRDS(glue("{analysis_dir}/af_r3dmol_models.rds"))
pdb_list <- readRDS(glue("{analysis_dir}/af_ranked-0-PDBs.rds"))

Aligning PDB files and building MSA

Code
pdbs <- pdbaln(
  files = pdb_list,
  outfile = glue("{analysis_dir}/aln.fa")
)
# MSA
msa_seq_labels <- names(pdb_list) %>% gsub("IL2RBG_", "", .)
names(msa_seq_labels) <- paste0("seq", 1:length(msa_seq_labels))

Visualizing MSA

Code
p_msa <-
  ggmsa(
  glue("{analysis_dir}/aln.fa"),
  start = 408, end = 500,
  char_width = 0.5, seq_name = TRUE) +
  scale_y_discrete(labels = msa_seq_labels) +
  geom_seqlogo() +
  geom_msaBar()

ggsave(
  glue("{analysis_dir}/figures/msa.png"),
  plot = p_msa, width = 20, height = 10, dpi = 600
  )

Figure 2: Neo IL2 MSA

Visualizing pairwise structural and sequence level similarity across complexes

Code
# Calculate sequence identity
pdbs$id <- names(af_pdb_paths)
pairwise_seqid <- seqidentity(pdbs)
feature_order <-
  dist(x = pairwise_seqid, method = "euclidean") %>%
  seriate(method = "OLO") %>%
  get_order()
ranked_features_seqid <- rownames(pairwise_seqid)[feature_order]

pairwise_seqid_df <- pairwise_seqid %>%
  as.data.frame() %>%
  rownames_to_column("pdb1") %>%
  pivot_longer(!pdb1, names_to = "pdb2", values_to = "seqid") %>%
  mutate(
    pdb1 = factor(pdb1, ordered = TRUE, levels = ranked_features_seqid),
    pdb2 = factor(pdb2, ordered = TRUE, levels = ranked_features_seqid)
  )

# organize by sequence ID
p_pairwise_seq <- pairwise_seqid_df %>%
  ggplot(aes(x = pdb1, y = pdb2, fill = seqid)) +
  geom_tile() +
  theme_light() +
  labs(fill = "Sequence Identity (%)", x = NULL, y = NULL) +
  scale_fill_viridis_c(option = "F") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "top",
    legend.key.width = unit(1, 'cm')
  )

## Calculate RMSD
pairwise_rmsd <- rmsd(pdbs, fit=TRUE)
feature_order_rmsd <-
  dist(x = pairwise_rmsd, method = "euclidean") %>%
  seriate(method = "OLO") %>%
  get_order()
ranked_features_rmsd <- rownames(pairwise_seqid)[feature_order_rmsd]

pairwise_rmsd_df <- pairwise_rmsd %>%
  as.data.frame() %>%
  rownames_to_column("pdb1") %>%
  pivot_longer(!pdb1, names_to = "pdb2", values_to = "rmsd") %>%
  mutate(
    pdb1 = factor(pdb1, ordered = TRUE, levels = ranked_features_rmsd),
    pdb2 = factor(pdb2, ordered = TRUE, levels = ranked_features_rmsd)
  )

# organize by sequence ID
p_pairwise_rmsd <- pairwise_rmsd_df  %>%
  ggplot(aes(x = pdb1, y = pdb2, fill = rmsd)) +
  geom_tile() +
  theme_light() +
  labs(fill = "RMSD", x = NULL, y = NULL) +
  scale_fill_viridis_c(option = "F") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "top",
    legend.key.width = unit(1, 'cm')
  )

patch <- p_pairwise_seq + p_pairwise_rmsd + plot_annotation(tag_levels = "A")
patch
ggsave(
  glue("{analysis_dir}/figures/pairwise_SeqID_RMSD.png"),
  patch, width = 12, height = 6, dpi = 600
)

Figure 3: Pairwise Complex Similarity using Percent Sequence Similarity A) and RMSD B)

Code
af_models$`IL2RBG_neo2-15alpha`
Code
af_models$IL2RBG_neoMPNN2
Code
af_models$IL2RBG_neoMPNN3
Code
af_models$IL2RBG_neoMPNN4
Code
af_models$IL2RBG_neoMPNN6
Code
af_models$IL2RBG_neoMPNN7
Code
af_models$IL2RBG_neoMPNN14
Code
af_models$IL2RBG_neoMPNN17

2023-06-10 Docking

Split fasta file of neo2 variants into individual fasta files with IL2RB and IL2RG sequences

Code
# FASTA file of NEO2 variants
neo2_vars <- bio3d::read.fasta(glue("{proj_dir}/2023-06-06_20neo2s.fasta"))
# IL2 RECEPTORS
Il2rb <- bio3d::read.fasta(glue("{proj_dir}/Il2rb.fasta"))
Il2rg <- bio3d::read.fasta(glue("{proj_dir}/Il2rg.fasta"))

# Output dir of complex fastas
fasta_dir <-
  "/central/groups/MazmanianLab/joeB/docking/projects/neo2_variants/2023-06-13_fastas"
dir.create(fasta_dir, showWarnings = FALSE)

for (f in 1:length(neo2_vars$id)) {
  # save fasta file with receptor and unique il2 variant
  var_name <- strex::str_before_first(neo2_vars$id[f], "_")
  # seq <- bio3d::as.fasta(paste(neo2_vars$ali[f, ], collapse = ""))
  seq <- bio3d::as.fasta(neo2_vars$ali[f, ])
  output_fasta <- glue(
    "{fasta_dir}/IL2RBG_{var_name}.fasta"
  )
  bio3d::write.fasta(Il2rb,
    file = output_fasta,
    ids = "Il2rb"
  )
  bio3d::write.fasta(Il2rg,
    file = output_fasta,
    ids = "Il2rg",
    append = TRUE
  )
  bio3d::write.fasta(
    alignment = seq,
    file = output_fasta,
    ids = var_name,
    append = TRUE
  )
}

Executing multimer

Code
cluster_reports <- glue(
  "{wkdir}/.cluster_runs/",
  "{Sys.Date()}_AFMultimer_NEO-IL2-DOCKING_{rand_string()}"
)
message("\n\n CREATING:  ", cluster_reports, "\n")
shell_do(glue("mkdir -p {cluster_reports}"))

complex_fastas <-
  list.files(fasta_dir, full.names = TRUE)

for (complex_path in complex_fastas) {
  jname <- fs::path_ext_remove(basename(complex_path))
  slurm_run_alphafold(
    jobname = jname,
    slurm_out = cluster_reports,
    input_fasta = complex_path,
    walltime = "06:00:00",
    mem = "32G",
    gpus = 1,
    cpus_per_task = 8,
    output_dir = glue("/central/scratch/jbok/alphafold-multimer/2023-06-13_neo-IL2/{jname}")
  )
}

Reading in multimer model pkl results

Code
use_condaenv("/home/jboktor/miniconda3/envs/pdmbsR/bin/python", required = TRUE)
source_python(glue("{src_dir}/python-scripts/pickle_reader.py"))

Extracting metrics of interest

Code
# list all model pkl files ----
output_dir <-
  glue(
    "/central/scratch/jbok/alphafold-multimer/2023-06-13_neo-IL2"
  )
model_pkl_paths <-
  list.files(
    output_dir, pattern = ".pkl", 
    full.names = TRUE, recursive = TRUE
  ) %>%
  keep(!grepl("features", .))

extract_pkl_results <- function(pkl_path) {
  pkl <- read_pickle_file(pkl_path)
  data_out <- list(
    pkl_path = pkl_path,
    ptm = pkl$ptm,
    iptm = pkl$iptm,
    max_predicted_aligned_error = pkl$max_predicted_aligned_error,
    ranking_confidence = pkl$ranking_confidence
  )
  return(data_out)
}

performance_results <- 
  purrr::map_dfr(model_pkl_paths, ~ extract_pkl_results(.))
saveRDS(
  performance_results,
  glue("{proj_dir}/{Sys.Date()}_multimer_performance_results.rds")
)

Visualizing model iPTM scores

Code
performance_results <- readRDS(
  glue("{proj_dir}/2023-06-14_multimer_performance_results.rds")
)
peformance_df <-
  performance_results %>%
  mutate(
    complex = basename(dirname(pkl_path)),
    model_name = basename(pkl_path),
    ranking_score = ptm + iptm
  )
  
p_iptms <- peformance_df %>%
  ggplot(aes(x = fct_reorder(complex, iptm), y = iptm)) +
  ggdist::stat_interval(
    .width = c(.25, 0.5, .75, 1),
    show.legend = T,
    height = 0.0001
  ) +
  gghalves::geom_half_point(
    side = "l", range_scale = .3, alpha = 0.8,
    shape = 21, color = "black", size = 1.5
  ) +
  scale_color_brewer(palette = "PuBuGn") +
  theme_light() +
  coord_flip() +
  labs(y = "iPTM", x = NULL, color = "interval") +
  theme(
    panel.grid = element_blank(),
    legend.position = "top"
  )

ggsave(
  glue("{proj_dir}/structural_analysis/figures/{Sys.Date()}_iptm.png"),
  p_iptms, width = 5, height = 7, dpi = 600
  )

Figure 4: iPTM distributions

Comparative Structural Analysis

Code
align_pdb_structures <- function(pdb_path_list) {
  color_pal <- viridis::viridis_pal(option = "H")(length(pdb_path_list)) %>% 
    strex::str_before_last(., "FF")
  # use first pdb in path as the base model
  base_pdb <- bio3d::read.pdb(pdb_path_list[1])
  base_model <- r3dmol() %>%
    m_add_model(data = m_bio3d(base_pdb)) %>%
    m_set_style(sel = m_sel(model = 0),
                style = m_style_cartoon(color = color_pal[1]))

  for (f in 2:length(pdb_path_list)) {
    message("\nProcessing file: ", f, " of ", 
      length(pdb_path_list), " : ", pdb_path_list[f])
    newpdb <- bio3d::read.pdb(pdb_path_list[f])
    pdbs_aln <- bio3d::struct.aln(base_pdb,
                                  newpdb,
                                  exefile = "msa",
                                  max.cycles = 100)
    newpdb$xyz <- pdbs_aln$xyz
    model_n <- f-1
    base_model %<>%
      m_add_model(data = m_bio3d(newpdb)) %>%
      m_set_style(sel = m_sel(model = model_n),
                  style = m_style_cartoon(color = color_pal[f])) %>%
      m_zoom_to()
  }
  return(base_model)
}

Generating 3D visuals

Code
af_pdb_paths <- list.files(
  "/central/scratch/jbok/alphafold-multimer/2023-06-13_neo-IL2",
  full.names = TRUE, recursive = TRUE, pattern = "ranked_0.pdb"
)
aligned_pdbs <- align_pdb_structures(pdb_path_list = af_pdb_paths)
saveRDS(aligned_pdbs, glue("{analysis_dir}/{Sys.Date()}_aligned-PDBs.rds"))

pdb_list <- af_pdb_paths %>% 
  purrr::set_names() %>% 
  purrr::map( ~ bio3d::read.pdb(.))
saveRDS(pdb_list, glue("{analysis_dir}/{Sys.Date()}_af_ranked-0-PDBs.rds"))
Code
aligned_pdbs <- readRDS(glue("{analysis_dir}/2023-06-14_aligned-PDBs.rds"))
aligned_pdbs %>%
   m_rotate(angle = 90, axis = "y") %>%
    m_spin()
Code
sessionInfo()
R version 4.2.0 (2022-04-22)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)

Matrix products: default
BLAS/LAPACK: /central/groups/MazmanianLab/joeB/software/mambaforge/envs/pdmbsR/lib/libopenblasp-r0.3.23.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] reticulate_1.30-9000 protr_1.6-3          ggdist_3.3.0        
 [4] gghalves_0.1.4       patchwork_1.1.2      ggmsa_1.4.0         
 [7] seriation_1.4.2      r3dmol_0.1.2         ggside_0.2.2        
[10] viridis_0.6.3        viridisLite_0.4.2    ggpointdensity_0.1.0
[13] ggpackets_0.2.1      bio3d_2.4-4          fs_1.6.2            
[16] glue_1.6.2           magrittr_2.0.3       lubridate_1.9.2     
[19] forcats_1.0.0        stringr_1.5.0        dplyr_1.1.2         
[22] purrr_1.0.1          readr_2.1.4          tidyr_1.3.0         
[25] tibble_3.2.1         ggplot2_3.4.2        tidyverse_2.0.0     

loaded via a namespace (and not attached):
 [1] colorspace_2.1-0       ggtree_3.6.2           ellipsis_0.3.2        
 [4] XVector_0.38.0         aplot_0.1.10           farver_2.1.1          
 [7] fansi_1.0.4            codetools_0.2-19       extrafont_0.19        
[10] knitr_1.42             polyclip_1.10-4        jsonlite_1.8.5        
[13] Rttf2pt1_1.3.12        png_0.1-8              shiny_1.7.4           
[16] ggforce_0.4.1          compiler_4.2.0         backports_1.4.1       
[19] Matrix_1.5-4.1         fastmap_1.1.1          lazyeval_0.2.2        
[22] cli_3.6.1              later_1.3.1            tweenr_2.0.2          
[25] htmltools_0.5.5        tools_4.2.0            gtable_0.3.3          
[28] GenomeInfoDbData_1.2.9 maps_3.4.1             Rcpp_1.0.10           
[31] vctrs_0.6.2            Biostrings_2.66.0      ggalt_0.4.0           
[34] ape_5.7-1              nlme_3.1-162           extrafontdb_1.0       
[37] iterators_1.0.14       xfun_0.39              mime_0.12             
[40] timechange_0.2.0       lifecycle_1.0.3        ca_0.71.1             
[43] zlibbioc_1.44.0        MASS_7.3-60            scales_1.2.1          
[46] TSP_1.2-4              promises_1.2.0.1       hms_1.1.3             
[49] parallel_4.2.0         proj4_1.0-12           RColorBrewer_1.1-3    
[52] yaml_2.3.7             strex_1.6.0            R4RNA_1.26.0          
[55] gridExtra_2.3          ggfun_0.0.9            seqmagick_0.1.5       
[58] yulab.utils_0.0.6      stringi_1.7.12         S4Vectors_0.36.2      
[61] foreach_1.5.2          tidytree_0.4.2         checkmate_2.2.0       
[64] BiocGenerics_0.44.0    GenomeInfoDb_1.34.9    rlang_1.1.1           
[67] pkgconfig_2.0.3        bitops_1.0-7           distributional_0.3.2  
[70] evaluate_0.21          lattice_0.21-8         treeio_1.22.0         
[73] htmlwidgets_1.6.2      tidyselect_1.2.0       R6_2.5.1              
[76] IRanges_2.32.0         generics_0.1.3         pillar_1.9.0          
[79] withr_2.5.0            RCurl_1.98-1.12        ash_1.0-15            
[82] crayon_1.5.2           KernSmooth_2.23-21     utf8_1.2.3            
[85] tzdb_0.4.0             rmarkdown_2.22         grid_4.2.0            
[88] digest_0.6.31          xtable_1.8-4           httpuv_1.6.11         
[91] gridGraphics_0.5-1     stats4_4.2.0           munsell_0.5.0         
[94] registry_0.5-1         ggplotify_0.1.0